--- toc: true title: Scooter Exploration image: images/company_logo.png keywords: fastai sidebar: home_sidebar summary: "In this chapter Scooter data is explored" description: "In this chapter Scooter data is explored" nb_path: "notebooks/02_scooterExploration.ipynb" ---
This chapter is brought to you in collaboration by Brian Kelly, Michael Vandi, Logan Shertz, and Charles Karpati.
Dataset: Scooter data:
# (Optional) Run this cell to gain access to Google Drive (Colabs only)
from google.colab import drive
# Colabs operates in a virtualized environment.
# Colabs default directory is at ~/content.
# We mount Drive into a temporary folder at '~/content/drive'
drive.mount('/content/drive')
File paths will vary.
# Move into the Drive folder that holds the scooter data (path will differ per user).
# cd ./drive/'My Drive'/BNIA/'Scooter Use Data'/BNIA
cd ./Routes
# List the current and parent directories to confirm the working location.
! ls
!cd ../ && ls
# Install the third-party geo/visualization libraries this notebook uses;
# %%capture suppresses the long pip output.
%%capture
! pip install dexplot
!pip install folium
!pip install geopandas
!pip install ipyleaflet
!pip install gpdvega
!pip install dataplay
%%capture
import os
import pandas as pd
import geopandas as gpd
import dexplot as dxp
import folium as fol
import json
import altair as alt
import gpdvega
import dataplay
# These imports will handle everything
import os
import sys
import csv
from IPython.display import clear_output
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
from geopandas import GeoDataFrame
import psycopg2
import pyproj
from pyproj import Proj, transform
# conda install -c conda-forge proj4
from shapely.geometry import Point
from shapely import wkb
from shapely.wkt import loads
# https://pypi.org/project/geopy/
from geopy.geocoders import Nominatim
import folium
from folium import plugins
from dataplay.merge import mergeDatasets
import dexplot as dxp
# In case file is KML, enable support
import fiona
fiona.drvsupport.supported_drivers['kml'] = 'rw'
fiona.drvsupport.supported_drivers['KML'] = 'rw'
#this cell is good to copy
from shapely.geometry import LineString
# Register pandas datetime converters so matplotlib can plot date columns.
pd.plotting.register_matplotlib_converters()
from dataplay.geoms import readInGeometryData
from shapely import wkt
# NOTE(review): readInGeometryData and ipywidgets are each imported twice in
# this cell — harmless, but one of each could be removed.
from dataplay.geoms import readInGeometryData
import ipywidgets as widgets
!jupyter nbextension enable --py widgetsnbextension
from IPython.core.interactiveshell import InteractiveShell
# 'all' makes the notebook echo every expression's value, not just the last one.
InteractiveShell.ast_node_interactivity = 'all'
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
# Lift altair's default 5000-row limit so full GeoDataFrames can be charted.
alt.data_transformers.enable('default', max_rows=None)
def split(geom):
    """Return the two boundary endpoints of a LineString geometry.

    Returns (left.x, left.y, right.x, right.y) when *geom* is an open
    LineString.  For any other geometry type — or a closed ring, whose
    boundary is 'MULTIPOINT EMPTY' — returns (False, type(geom), False,
    type(geom)) so failed rows can be identified downstream.
    """
    print( str( type(geom)) )
    #Linestring might not be the actual type, so this may need to be fixed.
    #If it's not a LineString then don't run it; stuff 'False' and the datatype instead.
    if str( type(geom)) == "<class 'shapely.geometry.linestring.LineString'>" and not str(geom.boundary) == 'MULTIPOINT EMPTY':
        print(geom.boundary)
        # NOTE(review): unpacking geom.boundary iterates a MultiPoint, which
        # Shapely 2.x no longer allows (use .geoms there) — confirm the pinned
        # shapely version before upgrading.
        left, right = geom.boundary
        print('left x: ', type(left.x), left.x)
        print('left y: ', type(left.y), left.y)
        print('right x: ', type(right.x), right.x)
        print('right y: ', type(right.y), right.y)
        return left.x, left.y, right.x, right.y
    else: return False, type(geom), False, type(geom)
def getPointsInPolygons(pts, polygons, ptsCoordCol, polygonsCoordCol):
    """Count, for every polygon, how many of the given points fall inside it.

    pts              : GeoDataFrame of point records (consumed destructively
                       inside this function's local copy of the reference).
    polygons         : GeoDataFrame of polygon records; gains a new
                       'pointsinpolygon' count column.
    ptsCoordCol      : name of the geometry column in *pts*.
    polygonsCoordCol : name of the geometry column in *polygons*.

    Returns *polygons* with the added 'pointsinpolygon' column.
    """
    count = 0
    # Row count: total element count divided by column count.
    total = pts.size / len(pts.columns)
    # We're going to keep a list of how many points we find.
    pts_in_polys = []
    # Loop over polygons with index i.
    for i, poly in polygons.iterrows():
        # print('Searching for point within Geom:', poly )
        # Keep a list of points in this poly
        pts_in_this_poly = []
        # Now loop over all points with index j.
        for j, pt in pts.iterrows():
            if poly[polygonsCoordCol].contains(pt[ptsCoordCol]):
                # Then it's a hit! Add it to the list,
                # and drop it so we have less hunting.
                pts_in_this_poly.append(pt[ptsCoordCol])
                # Rebinding `pts` does not disturb the inner iterator (it keeps
                # walking the old frame), but it shrinks the search space for
                # every later polygon, so each point is claimed at most once.
                pts = pts.drop([j])
        # We could do all sorts, like grab a property of the
        # points, but let's just append the number of them.
        pts_in_polys.append(len(pts_in_this_poly))
        print('Found this many points within the Geom:', len(pts_in_this_poly) )
        count = count + len(pts_in_this_poly)
        clear_output(wait=True)
    # Add the number of points for each poly to the dataframe.
    # NOTE(review): GeoSeries gets a default 0..n-1 index, so this aligns
    # correctly only when `polygons` has a default RangeIndex — confirm.
    polygons['pointsinpolygon'] = gpd.GeoSeries(pts_in_polys)
    print( 'Total Points: ', total )
    print( 'Total Points in Polygons: ', count )
    # NOTE(review): raises ZeroDivisionError when `pts` arrives empty.
    print( 'Prcnt Points in Polygons: ', count / total )
    return polygons
def findFile(root, file):
    """Recursively search *root* for *file*.

    Returns '<containing-dir>/<file>' for the first match encountered by
    os.walk, or None when the file is nowhere under *root*.
    """
    for dirpath, _subdirs, filenames in os.walk(root):
        if file in filenames:
            # Same '<dir>/<file>' shape the original built via the confusing
            # reversed "{1}/{0}".format(file, d); the unreachable `break`
            # that followed the `return` has been removed.
            return f"{dirpath}/{file}"
    # Explicit miss result instead of silently falling off the end.
    return None
# To 'import' a script you wrote, map its filepath into the sys
def addPath(root, file):
    """Append the absolute path of *file* (searched under *root*) to sys.path.

    Bug fix: the original ignored the *root* argument and always searched './'.
    NOTE(review): findFile returns None when the file is absent, which makes
    os.path.abspath raise TypeError — same failure mode as the original.
    """
    sys.path.append(os.path.abspath(findFile(root, file)))
# Sanity check: locate one routing file, then list the data folders.
findFile('./', 'Routing September 2019.geojson')
ls
ls 'Trip origins-destinations by month'
#@markdown Forms support many types of fields.
fileName = "Trip Origins by block September 2019.geojson" #@param ['Daily Deployment average by block August 2019.csv', 'Daily Deployment average by block December 2019.csv', 'Daily Deployment average by block November 2019.csv', 'Daily Deployment average by block October 2019.csv', 'Daily Deployment average by block September 2019.csv', 'Trip Destinations by block August 2019.geojson','Trip Destinations by block December 2019.geojson','Trip Destinations by block November 2019.geojson', 'Trip Destinations by block October 2019.geojson', 'Trip Destinations by block September 2019.geojson', 'Trip Origins by block August 2019.geojson','Trip Origins by block December 2019.geojson','Trip Origins by block November 2019.geojson','Trip Origins by block October 2019.geojson','Trip Origins by block September 2019.geojson']
#@markdown ---
# Load the selected monthly file and preview the 'value' (trip count) column.
gdf = gpd.read_file( findFile('./', fileName) )
gdf.head()
gdf.plot(column = 'value')
# Build a point layer from the block-polygon centroids.
point_df = gdf.copy()
# Bug fix: was `df.centroid` — `df` is undefined in this notebook; the
# centroids come from the GeoDataFrame loaded just above.
point_df['centroids'] = gdf.centroid
point_df = point_df.drop(columns = 'geometry')
point_df = point_df.set_geometry('centroids')
point_df.head(1)
# Marker size scales with the trip-count 'value'; highlight blocks over 1000 trips.
point_df.plot(marker='o', color='red', markersize='value')
point_df.value.hist()
point_df[point_df.value > 1000].plot()
# Baltimore Community Statistical Area (CSA) boundaries from Open Baltimore.
baltimore = gpd.read_file("https://opendata.arcgis.com/datasets/b738a8587b6d479a8824d937892701d8_0.geojson")
from dataplay.geoms import workWithGeometryData
baltimore.columns
# method='ponp' (points-in-polygons): label each centroid with its containing CSA.
pointsInPolys = workWithGeometryData(method = 'ponp', df = point_df, polys = baltimore, ptsCoordCol ='centroids',
polygonsCoordCol = 'geometry', polygonsLabel = 'CSA2010')
pointsInPolys.plot(column='value', legend=True, markersize = 1)
ponp_copy = pointsInPolys.copy()
# Per-CSA missing-value diagnostics: missing counts, not-missing/missing ratio,
# and percent missing relative to the summed values.
ponp_copy.value.isnull().groupby([ponp_copy['CSA2010']]).sum().astype(int).reset_index(name='NumberMissingCount').head()
ponp_copy.value.notnull().groupby([ponp_copy['CSA2010']]).sum().astype(int) / ponp_copy.value.isnull().groupby([ponp_copy['CSA2010']]).sum().astype(int)
ponp_copy.value.isnull().groupby([ponp_copy['CSA2010']]).sum().astype(int) / ponp_copy.value.groupby([ponp_copy['CSA2010']]).sum().astype(int) * 100
# Treat missing as -1 and total the trip values per CSA.
ponp_copy.fillna(-1).groupby('CSA2010')['value'].sum()
# Descriptive statistics for the 'value' (trip count) column.
gdf.value.unique()
gdf.value.describe()
gdf.value.var() # Return unbiased variance over requested axis.
gdf.value.sem() # Return unbiased standard error of the mean over requested axis.
gdf.value.nunique() # Count distinct observations over requested axis.
gdf.shape
#DataFrame.size Return an int representing the number of elements in this object.
gdf.size
# DataFrame.ndim Return an integer representing the number of axes/array dimensions.
gdf.ndim
# Note Used :
# DataFrame.axes Return a list representing the axes of the DataFrame.
gdf.dtypes
# Return unbiased kurtosis over requested axis using Fisher’s definition of kurtosis (kurtosis of normal == 0.0).
gdf.kurtosis()
# Display settings: wider frames, 2-decimal floats, echo every expression,
# and clip long cell text at 20 characters.
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.precision', 2)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('max_colwidth', 20)
in_crs = 2248 # The CRS we receive our data in (EPSG:2248, Maryland State Plane ftUS — confirm).
out_crs = 4326 # The CRS we would like to have our data represented as (WGS84).
geom = 'geometry' # The column where our spatial information lives.
# A Url to load: 2010 Baltimore tract boundaries (no water), published as CSV.
boundariesBaltimoreTractsNoWater2010 = "https://docs.google.com/spreadsheets/d/e/2PACX-1vQ8xXdUaT17jkdK0MWTJpg3GOy6jMWeaXTlguXNjCSb8Vr_FanSZQRaTU-m811fQz4kyMFK5wcahMNY/pub?gid=886223646&single=true&output=csv"
# Read in the dataframe
gdf = pd.read_csv(boundariesBaltimoreTractsNoWater2010)
# Convert the geometry column datatype from a string of text into a coordinate datatype.
gdf['geometry'] = gdf['geometry'].apply(lambda x: loads( str(x) ))
# Process the dataframe as a geodataframe with a known CRS and geom column.
gdf = GeoDataFrame(gdf, crs=in_crs, geometry='geometry')
# Bug fix: the original called .head() on the URL *string*
# (boundariesBaltimoreTractsNoWater2010), which would raise AttributeError.
# Preview the loaded GeoDataFrame instead.
gdf.head()
Ensure merge is on consistent datatypes
# Both merge keys must be the same dtype (string) for the merge to match.
gdf['GEOID10'] = gdf['GEOID10'].astype(str)
# NOTE(review): `scooterdf` is never created in this notebook as shown —
# presumably loaded in a missing cell; confirm before running.
scooterdf['nameChange2'] = scooterdf['nameChange2'].astype(str)
Perform the merge
# Inner merge on tract GEOID, then drop the now-redundant key/name columns.
scooterdfClean = gdf.merge(scooterdf, left_on='GEOID10', right_on='nameChange2').drop(['name', 'nameChange1', 'nameChange2'], axis=1)
scooterdfClean.head()
# gdf.drop(columns='geometry').to_csv('./boundsdf.csv', index=False)
ls
# Per-CSA aggregates: totals, missing/non-missing counts, and means.
scooterdfClean.groupby('CSA')['value'].sum()
scooterdfClean.value.isna().groupby([scooterdfClean['CSA']]).sum().astype(int).reset_index(name='notApplicable')
scooterdfClean.value.notnull().groupby([scooterdfClean['CSA']]).sum().astype(int).reset_index(name='NotMissingCount')
scooterdfClean.value.isnull().groupby([scooterdfClean['CSA']]).sum().astype(int).reset_index(name='NumberMissingCount')
scooterdfClean.fillna(-1).groupby('CSA')['value'].sum()
scooterdfClean.groupby('CSA')['value'].mean()
scooterdfClean.groupby('CSA')['value'].sum()
scooterdfClean.CSA.value_counts()
Reference: calculating statistics in pandas — https://pandas.pydata.org/docs/getting_started/intro_tutorials/06_calculate_statistics.html
# Pick a monthly routing file and the attribute column to inspect (Colab form).
fileName = 'Routing December 2019.geojson' #@param ['Routing August 2019.geojson', 'Routing October 2019.geojson', 'Routing December 2019.geojson', 'Routing September 2019.geojson', 'Routing November 2019.geojson']
columnName = "streetname" #@param ['id', 'color', 'streetname', 'trip_count_sum', 'trip_count_average', 'trip_count_percent']
gdf = gpd.read_file( findFile('./', fileName) )
gdf.head()
# Re-load the CSA boundaries for the base map.
baltimore = gpd.read_file("https://opendata.arcgis.com/datasets/b738a8587b6d479a8824d937892701d8_0.geojson")
# NOTE(review): `tst` is dead code — a background-layer chart kept as a string
# for reference; it is never executed.
tst = """background = alt.Chart(gdf).mark_geoshape(
stroke='white',
strokeWidth=2
).encode(
color=alt.value('#eee'),
).properties(
width=700,
height=500
)"""
# GeoDataFrame could be passed as usual pd.DataFrame
# Choropleth of CSA population ('tpop10') with CSA-name tooltips.
city = alt.Chart(baltimore).mark_geoshape(
).project(
).encode(
color='tpop10', # shorthand infer types as for regular pd.DataFrame
tooltip='CSA2010' # GeoDataFrame.index is accessible as id
).properties(
width=500,
height=300
)
# Route lines drawn as unfilled outlines, layered on top of the city map.
routes = alt.Chart(gdf).mark_geoshape(
filled=False,
strokeWidth=2
)
city + routes
Clean the gdf of empties.
gdf = gdf[~gdf.isna()]
gdf = gdf[~gdf.is_empty]
Now get the extremities; this will take a minute.
gdf['lefty'], gdf['leftx'],gdf['righty'], gdf['rightx'] = zip(*gdf["geometry"].map(split))
Split the gdf into a left and right dataset
# Split into two frames: the left-endpoint frame keeps the street attributes,
# the right-endpoint frame keeps only its own coordinates.
gdf_left = gdf.copy()
gdf_left = gdf_left.drop(columns = ['geometry','rightx', 'righty'])
gdf_right= gdf.copy()
gdf_right = gdf_right.drop(columns = ['geometry','leftx', 'lefty', 'streetname', 'trip_count_sum', 'trip_count_average', 'trip_count_percent', 'color' ])
The coordinate variables of object type will cause problems.
gdf_right.dtypes
Let's go ahead and coerce a correction.
# split() stuffed False/type placeholders into failed rows; errors='coerce'
# turns those into NaN so the columns become numeric.
gdf_right['righty']=pd.to_numeric(gdf_right['righty'], errors='coerce')
gdf_left['lefty']=pd.to_numeric(gdf_left['lefty'], errors='coerce')
gdf_right['rightx']=pd.to_numeric(gdf_right['rightx'], errors='coerce')
gdf_left['leftx']=pd.to_numeric(gdf_left['leftx'], errors='coerce')
Now we can view the results
gdf_right.dtypes
Save these csv's because it took a minute to get to where we are now.
gdf_right.to_csv('rightRouts.csv')
gdf_left.to_csv('leftRouts.csv')
Convert the datasets to geodataframes for further exploration!
#temp_gdf = gpd.GeoDataFrame( gdf_right, geometry=gpd.points_from_xy(gdf_right.rightx, gdf_right.righty) )
#temp_gdf.head()
# Alternately this could work.. unfinished.. but wkt.loads can make a Point from text
# gdf_right['strCol']=gdf_right['rightx'].astype(str)
# gdf_right['geometry'] = gdf_right['strCol'].apply(wkt.loads)
# Build GeoDataFrames from the endpoint coordinate columns (porg='p' = points).
# NOTE(review): lat='leftx' / lng='lefty' looks inverted, but split()'s output
# was assigned to swapped column names earlier, so the two swaps cancel — verify.
# NOTE(review): EPSG:4268 is likely a typo for 4269 (NAD83) or 4326 — confirm.
left_csaMap = readInGeometryData(url=gdf_left, porg='p', geom= False, lat= 'leftx', lng= 'lefty', revgeocode=False, save=False, in_crs=4268, out_crs=4268)
right_csaMap = readInGeometryData(url=gdf_right, porg='p', geom= False, lat= 'rightx', lng= 'righty', revgeocode=False, save=False, in_crs=4268, out_crs=4268)
right_csaMap.head()
left_csaMap.head()
left_csaMap.plot(column='trip_count_sum')
right_csaMap.plot()
baltimore.columns
# Count trip start/end points per CSA polygon (slow: nested iterrows scan).
right_points_in_poly = getPointsInPolygons(right_csaMap, baltimore, 'geometry', 'geometry')
right_csaMap.head()
left_points_in_poly = getPointsInPolygons(left_csaMap, baltimore, 'geometry', 'geometry')
left_csaMap.head()
left_points_in_poly.drop('geometry', axis=1)[['pointsinpolygon']].head(20)
right_points_in_poly.drop('geometry', axis=1)[['pointsinpolygon']].head(20)
left_points_in_poly.plot(column='pointsinpolygon', legend=True)
right_points_in_poly.plot(column='pointsinpolygon', legend=True)
right_points_in_poly
# Join the left- and right-endpoint counts per CSA into one frame.
scooterdfClean = left_points_in_poly.merge(right_points_in_poly, left_on='CSA2010', right_on='CSA2010', how='left')
scooterdfClean.columns
right_points_in_poly.columns
right_points_in_poly.head()
right_points_in_poly.plot('pointsinpolygon', legend=True)
import altair as alt
import geopandas as gpd
import gpdvega
# GeoDataFrame could be passed as usual pd.DataFrame
# Choropleth of per-CSA endpoint counts, with CSA-name/count tooltips.
chart = alt.Chart(right_points_in_poly).mark_geoshape(
).project(
).encode(
color='pointsinpolygon', # shorthand infer types as for regular pd.DataFrame
tooltip=['CSA2010','pointsinpolygon'] # GeoDataFrame.index is accessible as id
).properties(
width=500,
height=300
)
# Overlay the route outlines on the choropleth.
routes = alt.Chart(gdf).mark_geoshape(
filled=False,
strokeWidth=2
)
chart + routes
# Export the chart; fileName[:-8] strips the '.geojson' extension.
chart.save(fileName[:-8]+'.html', embed_options={'renderer':'svg'})
chart.save(fileName[:-8]+'.json')
ls 'Trip origins-destinations by month'
#@markdown Forms support many types of fields.
# Pick a monthly origins/destinations file and a column to plot (Colab form).
fileName = 'Trip Destinations by block August 2019.geojson' #@param ['Trip Destinations by block August 2019.geojson', 'Trip Destinations by block December 2019.geojson', 'Trip Destinations by block November 2019.geojson', 'Trip Destinations by block October 2019.geojson', 'Trip Destinations by block September 2019.geojson', 'Trip Origins by block August 2019.geojson', 'Trip Origins by block December 2019.geojson', 'Trip Origins by block November 2019.geojson', 'Trip Origins by block October 2019.geojson', 'Trip Origins by block September 2019.geojson']
columnName = "name" #@param ['name', 'value', 'color', 'radius']
#@markdown ---
gdf = gpd.read_file( findFile('./', fileName) )
gdf.plot( column = columnName)
gdf.columns
gdf[['id','name', 'value', 'color', 'radius']].head(5)
# Median 'value' per 'color' category, as a bar chart and a scatter plot.
dxp.bar(x='color', y='value', data=gdf, aggfunc='median')
dxp.scatter(x = "color", y = "value", data = gdf, aggfunc = "median")